QOL Healthcare Utilization (VSURF and Analysis of Deviance)

Author

Miguel Fudolig

Code
library(tidyverse)
library(ggplot2)
library(car)
library(caret)
library(rFerns)
library(VSURF)
library(glmnet)
library(Boruta)
library(doParallel)
library(Hmisc)
library(mlbench)
library(pROC)

Setup

Code
qol <- read_csv("AAQoL.csv") |> mutate(across(where(is.character), ~as.factor(.x))) |> 
  mutate(`English Difficulties`=relevel(`English Difficulties`,ref="Not at all"),
         `English Speaking`=relevel(`English Speaking`,ref="Not at all"),
         Ethnicity = relevel(Ethnicity,ref="Chinese"),
         Religion=relevel(Religion,ref="None")) |> 
  mutate(Income_median = case_match(Income,"$0 - $9,999"~"Below",
                                         "$10,000 - $19,999" ~"Below",
                                         "$20,000 - $29,999"~"Below",
                                         "$30,000 - $39,999"~"Below",
                                         "$40,000 - $49,999"~"Below",
                                         "$50,000 - $59,999"~"Below",
                                         "$60,000 - $69,999"~"Above",
                                         "$70,000 and over"~"Above",
                                          .default=Income)) |> 
  mutate(Income_median = factor(Income_median, levels=c("Below","Above"))) |> 
  mutate(across(`Familiarity with America`:`Familiarity with Ethnic Origin`,~factor(.x,levels=c("Very low","Low", "High", "Very high"))),
         across(`Identify Ethnically`,~factor(.x,levels=c("Not at all","Not very close","Somewhat close","Very close"))),
         across(`Belonging`,~factor(.x,levels=c("Not at all","Not very much","Somewhat","Very much"))),
         `Primary Language` = as.factor(`Primary Language`))
New names:
Rows: 2609 Columns: 231
── Column specification
──────────────────────────────────────────────────────── Delimiter: "," chr
(190): Gender, Ethnicity, Marital Status, No One, Spouse, Children, Gran... dbl
(41): Survey ID, Age, Education Completed, Household Size, Grandparent,...
ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
Specify the column types or set `show_col_types = FALSE` to quiet this message.
• `Other` -> `Other...17`
• `Other` -> `Other...89`
Code
qol |> DT::datatable()
Warning in instance$preRenderHook(instance): It seems your data is too big for
client-side DataTables. You may consider server-side processing:
https://rstudio.github.io/DT/server.html

Unmet Health Need

VSURF Approach

Important

We will be using the Prediction Set to run an analysis of deviance to check model performance.

Code
rfdata <- qol |> select(`Unmet Health Need`,Ethnicity, Age, Gender,Religion, `Full Time Employment`,  Income_median, `US Born`:`Discrimination`,`Health Insurance`,`Dental Insurance`) |> 
    na.omit() |>
  as.data.frame() |> 
  rename_with(make.names)
Code
rfdata |> gtsummary::tbl_summary(include=Unmet.Health.Need)
Characteristic N = 1,9741
Unmet.Health.Need
    0 1,770 (90%)
    Yes 204 (10%)
1 n (%)
Code
n_minority <- rfdata |> filter(Unmet.Health.Need=="Yes")|> nrow()

rf_options_for_vsurf <- list(
  sampsize = c(n_minority, n_minority), # Sample 'n_minority' from each class
  strata = rfdata[,1],                     # Stratify by the response variable
  importance = TRUE                     # Ensure importance is calculated
)

VSURF(Unmet.Health.Need~.,
      rfdata,
      na.action="na.omit",
      parallel=T,verbose=F,
      rf.options=rf_options_for_vsurf)->vsurf.mod
Warning in VSURF.formula(Unmet.Health.Need ~ ., rfdata, na.action = "na.omit", : VSURF with a formula-type call outputs selected variables
which are indices of the input matrix based on the formula:
you may reorder these to get indices of the original data
Code
vsurf.mod |> summary()

 VSURF computation time: 10 secs 

 VSURF selected: 
    18 variables at thresholding step (in 4.5 secs)
    6 variables at interpretation step (in 2.7 secs)
    1 variables at prediction step (in 2.8 secs)

 VSURF ran in parallel on a PSOCK cluster and used 15 cores 
Code
names(rfdata[,-1])[vsurf.mod$varselect.pred]
[1] "English.Speaking"
Code
names(rfdata[,-1])[vsurf.mod$varselect.interp]
[1] "English.Speaking"      "Duration.of.Residency" "Age"                  
[4] "Ethnicity"             "Discrimination"        "English.Difficulties" 
Code
plot(vsurf.mod)

Code
vsurf.mod$mean.perf
[1] 0.1030142

Importance

Code
vi<- data.frame(Variable=names(rfdata[,-1])[vsurf.mod$imp.mean.dec.ind],
                Importance = vsurf.mod$imp.mean.dec,
                sd_Importance = vsurf.mod$imp.sd.dec
)|> 
  mutate(fill = case_when(Variable=="Ethnicity"~"red",
                                                 .default="black"))

vi |> mutate(across(Importance:sd_Importance,~round(.x,5)))
                         Variable Importance sd_Importance  fill
1                English.Speaking    0.01267       0.00099 black
2           Duration.of.Residency    0.01083       0.00048 black
3                             Age    0.00982       0.00042 black
4                       Ethnicity    0.00891       0.00052   red
5                  Discrimination    0.00765       0.00041 black
6            English.Difficulties    0.00645       0.00031 black
7                Dental.Insurance    0.00621       0.00063 black
8                        Religion    0.00530       0.00039 black
9            Full.Time.Employment    0.00274       0.00021 black
10                      Belonging    0.00246       0.00036 black
11            Identify.Ethnically    0.00218       0.00037 black
12                  Income_median    0.00215       0.00026 black
13               Health.Insurance    0.00209       0.00035 black
14               Primary.Language    0.00192       0.00020 black
15       Familiarity.with.America    0.00168       0.00033 black
16                        US.Born    0.00157       0.00014 black
17 Familiarity.with.Ethnic.Origin    0.00072       0.00026 black
18                         Gender    0.00055       0.00022 black
Code
importance_plot <- ggplot(vi, aes(x = reorder(Variable, Importance), y = Importance, fill=fill))+
  geom_bar(stat = "identity",alpha=0.4) +
  geom_errorbar(aes(ymin=Importance-sd_Importance, ymax = Importance+sd_Importance))+
  
  labs(title = "Variable Importance", x = "Variable", y = "Importance") +
  theme_classic() +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))+
  scale_fill_manual(values=c("black","red"),
                    guide="none")
  
plot(importance_plot)

Code
ggsave(filename = "FinalFigures/VSURF_importance_unmethealth.png", width=6, height=4.5,units="in")

Analysis of Deviance Using the Prediction Set

Code
all_formula <- Unmet.Health.Need~.
pred_vars <- names(rfdata[,-1])[vsurf.mod$varselect.interp]


pred_vars <- if("Ethnicity" %in% pred_vars){
  pred_vars
} else {
  c(pred_vars,"Ethnicity")
  }
mod_form <- reformulate(pred_vars, response="Unmet.Health.Need")

mod_form_noEth <- reformulate(pred_vars[!pred_vars %in%c("Ethnicity")], response="Unmet.Health.Need")

options(contrasts = c("contr.sum","contr.poly"))
mod1 <- glm(mod_form, family="binomial", data=rfdata)
mod2 <- glm(mod_form_noEth, family="binomial", data=rfdata)
anova(mod2,mod1)
Analysis of Deviance Table

Model 1: Unmet.Health.Need ~ English.Speaking + Duration.of.Residency + 
    Age + Discrimination + English.Difficulties
Model 2: Unmet.Health.Need ~ English.Speaking + Duration.of.Residency + 
    Age + Ethnicity + Discrimination + English.Difficulties
  Resid. Df Resid. Dev Df Deviance Pr(>Chi)   
1      1964     1207.5                        
2      1959     1187.4  5   20.067 0.001214 **
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Code
performance::check_model(mod1)

Code
data.frame(BIC_ethnicity = BIC(mod1),
           BIC_noethnicity=BIC(mod2))
  BIC_ethnicity BIC_noethnicity
1      1301.232         1283.36
Code
car::Anova(mod1, test.statistic = "F")
Analysis of Deviance Table (Type II tests)

Response: Unmet.Health.Need
Error estimate based on Pearson residuals 

                       Sum Sq   Df F value    Pr(>F)    
English.Speaking        35.80    3 12.1296 7.258e-08 ***
Duration.of.Residency    0.08    1  0.0767  0.781867    
Age                      0.76    1  0.7678  0.380994    
Ethnicity               20.07    5  4.0792  0.001095 ** 
Discrimination          28.31    1 28.7749 9.092e-08 ***
English.Difficulties     5.76    3  1.9529  0.119068    
Residuals             1927.39 1959                      
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Code
summary(mod1)

Call:
glm(formula = mod_form, family = "binomial", data = rfdata)

Coefficients:
                       Estimate Std. Error z value Pr(>|z|)    
(Intercept)           -2.016054   0.274226  -7.352 1.96e-13 ***
English.Speaking1      1.029224   0.237976   4.325 1.53e-05 ***
English.Speaking2      0.439003   0.148990   2.947  0.00321 ** 
English.Speaking3     -1.058307   0.210024  -5.039 4.68e-07 ***
Duration.of.Residency -0.002485   0.009047  -0.275  0.78362    
Age                   -0.005818   0.006725  -0.865  0.38698    
Ethnicity1            -0.276237   0.177019  -1.560  0.11864    
Ethnicity2            -0.317491   0.225698  -1.407  0.15951    
Ethnicity3             0.244252   0.254044   0.961  0.33632    
Ethnicity4             0.191657   0.171791   1.116  0.26458    
Ethnicity5            -0.403333   0.367192  -1.098  0.27202    
Discrimination         0.862615   0.161124   5.354 8.62e-08 ***
English.Difficulties1 -0.069735   0.163448  -0.427  0.66963    
English.Difficulties2  0.101377   0.142650   0.711  0.47729    
English.Difficulties3 -0.271418   0.139367  -1.948  0.05147 .  
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 1312.2  on 1973  degrees of freedom
Residual deviance: 1187.4  on 1959  degrees of freedom
AIC: 1217.4

Number of Fisher Scoring iterations: 5
Code
mod1 |>  emmeans::emmeans(~Ethnicity, type="response")
 Ethnicity      prob     SE  df asymp.LCL asymp.UCL
 Chinese      0.1050 0.0153 Inf    0.0786     0.139
 Asian Indian 0.1012 0.0237 Inf    0.0633     0.158
 Filipino     0.1649 0.0406 Inf    0.0998     0.260
 Korean       0.1578 0.0226 Inf    0.1184     0.207
 Other        0.0937 0.0370 Inf    0.0421     0.196
 Vietnamese   0.2133 0.0259 Inf    0.1669     0.269

Results are averaged over the levels of: English.Speaking, Discrimination, English.Difficulties 
Confidence level used: 0.95 
Intervals are back-transformed from the logit scale 
Code
mod1 |>  emmeans::emmeans(~Ethnicity) |> emmeans::contrast(method="eff")
 contrast            estimate    SE  df z.ratio p.value
 Chinese effect        -0.276 0.177 Inf  -1.560  0.3190
 Asian Indian effect   -0.317 0.226 Inf  -1.407  0.3190
 Filipino effect        0.244 0.254 Inf   0.961  0.3363
 Korean effect          0.192 0.172 Inf   1.116  0.3264
 Other effect          -0.403 0.367 Inf  -1.098  0.3264
 Vietnamese effect      0.561 0.161 Inf   3.484  0.0030

Results are averaged over the levels of: English.Speaking, Discrimination, English.Difficulties 
Results are given on the log odds ratio (not the response) scale. 
P value adjustment: fdr method for 6 tests 

Unmet Dental Need

VSURF Approach

Code
rfdata <- qol |> select(`Unmet Dental Needs`,Ethnicity, Age, Gender,Religion, `Full Time Employment`,  Income_median, `US Born`:`Discrimination`,`Health Insurance`,`Dental Insurance`) |> 
    na.omit() |>
  as.data.frame() |> 
  rename_with(make.names)
Code
rfdata |> gtsummary::tbl_summary(include=Unmet.Dental.Needs)
Characteristic N = 1,9681
Unmet.Dental.Needs
    0 1,744 (89%)
    Yes 224 (11%)
1 n (%)
Code
rfdata |> gtsummary::tbl_summary(include=Unmet.Dental.Needs)
Characteristic N = 1,9681
Unmet.Dental.Needs
    0 1,744 (89%)
    Yes 224 (11%)
1 n (%)
Code
n_minority <- rfdata |> filter(Unmet.Dental.Needs=="Yes")|> nrow()

rf_options_for_vsurf <- list(
  sampsize = c(n_minority, n_minority), # Sample 'n_minority' from each class
  strata = rfdata[,1],                     # Stratify by the response variable
  importance = TRUE                     # Ensure importance is calculated
)

VSURF(Unmet.Dental.Needs~.,
      rfdata,
      na.action="na.omit",
      parallel=T,verbose=F,
      rf.options=rf_options_for_vsurf)->vsurf.mod
Warning in VSURF.formula(Unmet.Dental.Needs ~ ., rfdata, na.action = "na.omit", : VSURF with a formula-type call outputs selected variables
which are indices of the input matrix based on the formula:
you may reorder these to get indices of the original data
Code
vsurf.mod |> summary()

 VSURF computation time: 7.7 secs 

 VSURF selected: 
    17 variables at thresholding step (in 4.6 secs)
    1 variables at interpretation step (in 2.6 secs)
    1 variables at prediction step (in 0.5 secs)

 VSURF ran in parallel on a PSOCK cluster and used 15 cores 
Code
names(rfdata[,-1])[vsurf.mod$varselect.pred]
[1] "Dental.Insurance"
Code
names(rfdata[,-1])[vsurf.mod$varselect.interp]
[1] "Dental.Insurance"
Code
plot(vsurf.mod)

Code
vsurf.mod$mean.perf
[1] 0.1148374

Importance

Code
vi<- data.frame(Variable=names(rfdata[,-1])[vsurf.mod$imp.mean.dec.ind],
                Importance = vsurf.mod$imp.mean.dec,
                sd_Importance = vsurf.mod$imp.sd.dec
)|> 
  mutate(fill = case_when(Variable=="Ethnicity"~"red",
                                                 .default="black"))

vi |> mutate(across(Importance:sd_Importance,~round(.x,5)))
                         Variable Importance sd_Importance  fill
1                Dental.Insurance    0.01362       0.00081 black
2                             Age    0.00999       0.00033 black
3                English.Speaking    0.00930       0.00070 black
4           Duration.of.Residency    0.00875       0.00048 black
5                        Religion    0.00734       0.00065 black
6                   Income_median    0.00542       0.00059 black
7                       Ethnicity    0.00395       0.00052   red
8        Familiarity.with.America    0.00384       0.00046 black
9                Primary.Language    0.00349       0.00035 black
10           English.Difficulties    0.00328       0.00052 black
11           Full.Time.Employment    0.00185       0.00028 black
12                        US.Born    0.00146       0.00018 black
13 Familiarity.with.Ethnic.Origin    0.00145       0.00032 black
14               Health.Insurance    0.00137       0.00034 black
15                 Discrimination    0.00107       0.00026 black
16                      Belonging    0.00077       0.00036 black
17            Identify.Ethnically    0.00063       0.00038 black
18                         Gender   -0.00015       0.00020 black
Code
importance_plot <- ggplot(vi, aes(x = reorder(Variable, Importance), y = Importance, fill=fill))+
  geom_bar(stat = "identity",alpha=0.4) +
  geom_errorbar(aes(ymin=Importance-sd_Importance, ymax = Importance+sd_Importance))+
  
  labs(title = "Variable Importance", x = "Variable", y = "Importance") +
  theme_classic() +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))+
  scale_fill_manual(values=c("black","red"),
                    guide="none")
  
plot(importance_plot)

Code
ggsave(filename = "FinalFigures/VSURF_importance_unmetdental.png", width=6, height=4.5,units="in")
Note

Dental Insurance is the only variable in the Prediction Set, which means Ethnicity might not be a significant predictor of unmet dental needs.

Analysis of Deviance

Code
pred_vars <- c(names(rfdata[,-1])[vsurf.mod$varselect.interp],"Ethnicity")
mod_form <- reformulate(pred_vars, response="Unmet.Dental.Needs")

mod_form_noEth <- reformulate(pred_vars[!pred_vars %in%c("Ethnicity")], response="Unmet.Dental.Needs")
mod1 <- glm(mod_form, family="binomial", data=rfdata)
mod2 <- glm(mod_form_noEth, family="binomial", data=rfdata)
anova(mod2,mod1)
Analysis of Deviance Table

Model 1: Unmet.Dental.Needs ~ Dental.Insurance
Model 2: Unmet.Dental.Needs ~ Dental.Insurance + Ethnicity
  Resid. Df Resid. Dev Df Deviance Pr(>Chi)
1      1966     1300.0                     
2      1961     1291.3  5   8.7216   0.1207
Code
performance::check_model(mod1)

Code
data.frame(BIC_ethnicity = BIC(mod1),
           BIC_noethnicity=BIC(mod2))
  BIC_ethnicity BIC_noethnicity
1       1344.41        1315.208
Code
summary(mod1)

Call:
glm(formula = mod_form, family = "binomial", data = rfdata)

Coefficients:
                  Estimate Std. Error z value Pr(>|z|)    
(Intercept)       -2.11153    0.09115 -23.165   <2e-16 ***
Dental.Insurance1  0.69784    0.07851   8.889   <2e-16 ***
Ethnicity1        -0.12599    0.15545  -0.810   0.4177    
Ethnicity2        -0.31150    0.17526  -1.777   0.0755 .  
Ethnicity3         0.17410    0.21838   0.797   0.4253    
Ethnicity4         0.23328    0.14812   1.575   0.1153    
Ethnicity5        -0.17960    0.30374  -0.591   0.5543    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 1395.0  on 1967  degrees of freedom
Residual deviance: 1291.3  on 1961  degrees of freedom
AIC: 1305.3

Number of Fisher Scoring iterations: 5
Code
mod1 |>  emmeans::emmeans(~Ethnicity, type="response")
 Ethnicity      prob     SE  df asymp.LCL asymp.UCL
 Chinese      0.0964 0.0138 Inf    0.0726     0.127
 Asian Indian 0.0814 0.0139 Inf    0.0580     0.113
 Filipino     0.1259 0.0266 Inf    0.0823     0.188
 Korean       0.1326 0.0170 Inf    0.1026     0.170
 Other        0.0919 0.0297 Inf    0.0480     0.169
 Vietnamese   0.1299 0.0176 Inf    0.0991     0.168

Results are averaged over the levels of: Dental.Insurance 
Confidence level used: 0.95 
Intervals are back-transformed from the logit scale 
Code
mod1 |>  emmeans::emmeans(~Ethnicity) |> emmeans::contrast(method="eff")
 contrast            estimate    SE  df z.ratio p.value
 Chinese effect        -0.126 0.155 Inf  -0.810  0.5104
 Asian Indian effect   -0.311 0.175 Inf  -1.777  0.3458
 Filipino effect        0.174 0.218 Inf   0.797  0.5104
 Korean effect          0.233 0.148 Inf   1.575  0.3458
 Other effect          -0.180 0.304 Inf  -0.591  0.5543
 Vietnamese effect      0.210 0.154 Inf   1.361  0.3469

Results are averaged over the levels of: Dental.Insurance 
Results are given on the log odds ratio (not the response) scale. 
P value adjustment: fdr method for 6 tests 
Note

The non-significant analysis of deviance implies that Ethnicity might not be a significant predictor of unmet dental needs.

Physical Check-up

VSURF

Code
rfdata <- qol |> 
  select(`Physical Check-up`,Ethnicity, Age, Gender,Religion, `Full Time Employment`,  Income_median, `US Born`:`Discrimination`,`Health Insurance`,`Dental Insurance`) %>%
  na.omit() |> 
  rename(Employment=`Full Time Employment`,
         EnglishSpeak=`English Speaking`,
         EnglishDiff=`English Difficulties`) |> 
  as.data.frame() |> 
  rename_with(make.names)
Code
rfdata |> gtsummary::tbl_summary(include=Physical.Check.up)
Characteristic N = 1,9661
Physical.Check.up
    0 630 (32%)
    Yes 1,336 (68%)
1 n (%)
Code
rfdata |> gtsummary::tbl_summary(include=Physical.Check.up)
Characteristic N = 1,9661
Physical.Check.up
    0 630 (32%)
    Yes 1,336 (68%)
1 n (%)
Code
n_minority <- rfdata |> filter(Physical.Check.up=="0")|> nrow()

rf_options_for_vsurf <- list(
  sampsize = c(n_minority, n_minority), # Sample 'n_minority' from each class
  strata = rfdata[,1],                     # Stratify by the response variable
  importance = TRUE                     # Ensure importance is calculated
)

VSURF(Physical.Check.up~.,
      rfdata,
      na.action="na.omit",
      parallel=T,verbose=F,
      rf.options=rf_options_for_vsurf)->vsurf.mod
Warning in VSURF.formula(Physical.Check.up ~ ., rfdata, na.action = "na.omit", : VSURF with a formula-type call outputs selected variables
which are indices of the input matrix based on the formula:
you may reorder these to get indices of the original data
Code
vsurf.mod |> summary()

 VSURF computation time: 11.1 secs 

 VSURF selected: 
    17 variables at thresholding step (in 5.4 secs)
    5 variables at interpretation step (in 3 secs)
    2 variables at prediction step (in 2.7 secs)

 VSURF ran in parallel on a PSOCK cluster and used 15 cores 
Code
names(rfdata[,-1])[vsurf.mod$varselect.pred]
[1] "Duration.of.Residency" "Health.Insurance"     
Code
names(rfdata[,-1])[vsurf.mod$varselect.interp]
[1] "Duration.of.Residency" "Age"                   "Health.Insurance"     
[4] "Dental.Insurance"      "Income_median"        
Code
plot(vsurf.mod)

Code
vsurf.mod$mean.perf
[1] 0.282528
Code
vi<- data.frame(Variable=names(rfdata[,-1])[vsurf.mod$imp.mean.dec.ind],
                Importance = vsurf.mod$imp.mean.dec,
                sd_Importance = vsurf.mod$imp.sd.dec
)|> 
  mutate(fill = case_when(Variable=="Ethnicity"~"red",
                                                 .default="black"))

vi |> mutate(across(Importance:sd_Importance,~round(.x,5)))
                         Variable Importance sd_Importance  fill
1           Duration.of.Residency    0.03000       0.00073 black
2                             Age    0.02716       0.00131 black
3                Health.Insurance    0.02245       0.00044 black
4                Dental.Insurance    0.01348       0.00057 black
5                   Income_median    0.00728       0.00052 black
6                       Ethnicity    0.00654       0.00060   red
7                    EnglishSpeak    0.00549       0.00068 black
8                      Employment    0.00464       0.00039 black
9                     EnglishDiff    0.00458       0.00054 black
10                       Religion    0.00408       0.00052 black
11       Familiarity.with.America    0.00251       0.00037 black
12 Familiarity.with.Ethnic.Origin    0.00238       0.00045 black
13                         Gender    0.00232       0.00056 black
14               Primary.Language    0.00109       0.00030 black
15                        US.Born    0.00085       0.00021 black
16                 Discrimination    0.00060       0.00031 black
17            Identify.Ethnically    0.00050       0.00033 black
18                      Belonging    0.00021       0.00036 black
Code
importance_plot <- ggplot(vi, aes(x = reorder(Variable, Importance), y = Importance, fill=fill))+
  geom_bar(stat = "identity",alpha=0.4) +
  geom_errorbar(aes(ymin=Importance-sd_Importance, ymax = Importance+sd_Importance))+
  
  labs(title = "Variable Importance", x = "Variable", y = "Importance") +
  theme_classic() +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))+
  scale_fill_manual(values=c("black","red"),
                    guide="none")
  
plot(importance_plot)

Code
ggsave(filename = "FinalFigures/VSURF_importance_PC.png", width=6, height=4.5,units="in")

Analysis of Deviance

Code
pred_vars <- c(names(rfdata[,-1])[vsurf.mod$varselect.interp],"Ethnicity")
mod_form <- reformulate(pred_vars, response="Physical.Check.up")

mod_form_noEth <- reformulate(pred_vars[!pred_vars %in%c("Ethnicity")], response="Physical.Check.up")
mod1 <- glm(mod_form, family="binomial", data=rfdata)
mod2 <- glm(mod_form_noEth, family="binomial", data=rfdata)
anova(mod2,mod1)
Analysis of Deviance Table

Model 1: Physical.Check.up ~ Duration.of.Residency + Age + Health.Insurance + 
    Dental.Insurance + Income_median
Model 2: Physical.Check.up ~ Duration.of.Residency + Age + Health.Insurance + 
    Dental.Insurance + Income_median + Ethnicity
  Resid. Df Resid. Dev Df Deviance  Pr(>Chi)    
1      1960     2152.4                          
2      1955     2115.6  5   36.766 6.673e-07 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Code
performance::check_model(mod1)

Code
data.frame(BIC_ethnicity = BIC(mod1),
           BIC_noethnicity=BIC(mod2))
  BIC_ethnicity BIC_noethnicity
1      2199.048        2197.895
Code
summary(mod1)

Call:
glm(formula = mod_form, family = "binomial", data = rfdata)

Coefficients:
                       Estimate Std. Error z value Pr(>|z|)    
(Intercept)           -0.912314   0.163698  -5.573 2.50e-08 ***
Duration.of.Residency  0.032732   0.005457   5.998 2.00e-09 ***
Age                    0.019095   0.003837   4.976 6.48e-07 ***
Health.Insurance1     -0.595463   0.080852  -7.365 1.77e-13 ***
Dental.Insurance1     -0.272499   0.063466  -4.294 1.76e-05 ***
Income_median1        -0.185135   0.059851  -3.093  0.00198 ** 
Ethnicity1            -0.037393   0.108227  -0.346  0.72971    
Ethnicity2             0.059656   0.116050   0.514  0.60722    
Ethnicity3             0.397812   0.165291   2.407  0.01610 *  
Ethnicity4            -0.524335   0.112804  -4.648 3.35e-06 ***
Ethnicity5            -0.297137   0.192876  -1.541  0.12342    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 2466.2  on 1965  degrees of freedom
Residual deviance: 2115.6  on 1955  degrees of freedom
AIC: 2137.6

Number of Fisher Scoring iterations: 4
Code
mod1 |>  emmeans::emmeans(~Ethnicity, type="response")
 Ethnicity     prob     SE  df asymp.LCL asymp.UCL
 Chinese      0.589 0.0295 Inf     0.531     0.646
 Asian Indian 0.613 0.0311 Inf     0.550     0.672
 Filipino     0.689 0.0417 Inf     0.602     0.765
 Korean       0.469 0.0316 Inf     0.408     0.531
 Other        0.525 0.0573 Inf     0.414     0.635
 Vietnamese   0.690 0.0311 Inf     0.626     0.748

Results are averaged over the levels of: Health.Insurance, Dental.Insurance, Income_median 
Confidence level used: 0.95 
Intervals are back-transformed from the logit scale 
Code
mod1 |>  emmeans::emmeans(~Ethnicity) |> emmeans::contrast(method="eff")
 contrast            estimate    SE  df z.ratio p.value
 Chinese effect       -0.0374 0.108 Inf  -0.346  0.7297
 Asian Indian effect   0.0597 0.116 Inf   0.514  0.7287
 Filipino effect       0.3978 0.165 Inf   2.407  0.0322
 Korean effect        -0.5243 0.113 Inf  -4.648  <.0001
 Other effect         -0.2971 0.193 Inf  -1.541  0.1851
 Vietnamese effect     0.4014 0.127 Inf   3.151  0.0049

Results are averaged over the levels of: Health.Insurance, Dental.Insurance, Income_median 
Results are given on the log odds ratio (not the response) scale. 
P value adjustment: fdr method for 6 tests 

Dental Check-up

VSURF

Code
rfdata <- qol |> 
  select(`Dentist Check-up`,Ethnicity, Age, Gender,Religion, `Full Time Employment`,  Income_median, `US Born`:`Discrimination`,`Health Insurance`,`Dental Insurance`) %>%
  na.omit() |> 
  rename(Employment=`Full Time Employment`,
         EnglishSpeak=`English Speaking`,
         EnglishDiff=`English Difficulties`) |> 
  as.data.frame() |> 
  rename_with(make.names)
Code
rfdata |> gtsummary::tbl_summary(include=Dentist.Check.up)
Characteristic N = 1,9611
Dentist.Check.up
    0 811 (41%)
    Yes 1,150 (59%)
1 n (%)
Code
rfdata |>  gtsummary::tbl_summary(include=Dentist.Check.up)
Characteristic N = 1,9611
Dentist.Check.up
    0 811 (41%)
    Yes 1,150 (59%)
1 n (%)
Code
n_minority <- rfdata |> filter(Dentist.Check.up=="0") |> nrow()

rf_options_for_vsurf <- list(
  sampsize = c(n_minority, n_minority), # Sample 'n_minority' from each class
  strata = rfdata[,1],                     # Stratify by the response variable
  importance = TRUE                     # Ensure importance is calculated
)

VSURF(Dentist.Check.up~.,
      rfdata,
      na.action="na.omit",
      parallel=T,verbose=F,
      rf.options=rf_options_for_vsurf)->vsurf.mod
Warning in VSURF.formula(Dentist.Check.up ~ ., rfdata, na.action = "na.omit", : VSURF with a formula-type call outputs selected variables
which are indices of the input matrix based on the formula:
you may reorder these to get indices of the original data
Code
vsurf.mod |> summary()

 VSURF computation time: 10.2 secs 

 VSURF selected: 
    18 variables at thresholding step (in 5.4 secs)
    3 variables at interpretation step (in 3.2 secs)
    2 variables at prediction step (in 1.6 secs)

 VSURF ran in parallel on a PSOCK cluster and used 15 cores 
Code
names(rfdata[,-1])[vsurf.mod$varselect.pred]
[1] "Dental.Insurance"      "Duration.of.Residency"
Code
names(rfdata[,-1])[vsurf.mod$varselect.interp]
[1] "Dental.Insurance"      "Duration.of.Residency" "Religion"             
Code
plot(vsurf.mod)

Code
vsurf.mod$mean.perf
[1] 0.295793
Code
vi<- data.frame(Variable=names(rfdata[,-1])[vsurf.mod$imp.mean.dec.ind],
                Importance = vsurf.mod$imp.mean.dec,
                sd_Importance = vsurf.mod$imp.sd.dec
)|> 
  mutate(fill = case_when(Variable=="Ethnicity"~"red",
                                                 .default="black"))

vi |> mutate(across(Importance:sd_Importance,~round(.x,5)))
                         Variable Importance sd_Importance  fill
1                Dental.Insurance    0.05197       0.00073 black
2           Duration.of.Residency    0.04911       0.00074 black
3                        Religion    0.01538       0.00083 black
4                       Ethnicity    0.01337       0.00042   red
5                             Age    0.01120       0.00076 black
6                Health.Insurance    0.00834       0.00048 black
7                    EnglishSpeak    0.00778       0.00049 black
8                   Income_median    0.00686       0.00044 black
9             Identify.Ethnically    0.00370       0.00042 black
10                    EnglishDiff    0.00321       0.00066 black
11       Familiarity.with.America    0.00274       0.00046 black
12 Familiarity.with.Ethnic.Origin    0.00261       0.00045 black
13                      Belonging    0.00207       0.00045 black
14                     Employment    0.00200       0.00028 black
15                         Gender    0.00197       0.00042 black
16                        US.Born    0.00176       0.00021 black
17               Primary.Language    0.00166       0.00025 black
18                 Discrimination    0.00073       0.00022 black
Code
importance_plot <- ggplot(vi, aes(x = reorder(Variable, Importance), y = Importance, fill=fill))+
  geom_bar(stat = "identity",alpha=0.4) +
  geom_errorbar(aes(ymin=Importance-sd_Importance, ymax = Importance+sd_Importance))+
  
  labs(title = "Variable Importance", x = "Variable", y = "Importance") +
  theme_classic() +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))+
  scale_fill_manual(values=c("black","red"),
                    guide="none")
  
plot(importance_plot)

Code
ggsave(filename = "FinalFigures/VSURF_importance_DC.png", width=6, height=4.5,units="in")

Analysis of Deviance

Code
pred_vars <- c(names(rfdata[,-1])[vsurf.mod$varselect.interp],"Ethnicity")
mod_form <- reformulate(pred_vars, response="Dentist.Check.up")

mod_form_noEth <- reformulate(pred_vars[!pred_vars %in%c("Ethnicity")], response="Dentist.Check.up")
mod1 <- glm(mod_form, family="binomial", data=rfdata)
mod2 <- glm(mod_form_noEth, family="binomial", data=rfdata)
anova(mod2,mod1)
Analysis of Deviance Table

Model 1: Dentist.Check.up ~ Dental.Insurance + Duration.of.Residency + 
    Religion
Model 2: Dentist.Check.up ~ Dental.Insurance + Duration.of.Residency + 
    Religion + Ethnicity
  Resid. Df Resid. Dev Df Deviance Pr(>Chi)
1      1952     2209.7                     
2      1947     2201.0  5   8.7418   0.1198
Code
performance::check_model(mod1)

Code
data.frame(BIC_ethnicity = BIC(mod1),
           BIC_noethnicity=BIC(mod2))
  BIC_ethnicity BIC_noethnicity
1      2307.098        2277.933
Code
summary(mod1)

Call:
glm(formula = mod_form, family = "binomial", data = rfdata)

Coefficients:
                       Estimate Std. Error z value Pr(>|z|)    
(Intercept)           -0.542307   0.115283  -4.704 2.55e-06 ***
Dental.Insurance1     -0.804673   0.054352 -14.805  < 2e-16 ***
Duration.of.Residency  0.045686   0.004692   9.738  < 2e-16 ***
Religion1              0.098471   0.158743   0.620    0.535    
Religion2              0.344886   0.178252   1.935    0.053 .  
Religion3              0.225075   0.166045   1.356    0.175    
Religion4             -0.193586   0.240057  -0.806    0.420    
Religion5             -0.152920   0.312546  -0.489    0.625    
Religion6             -0.423634   0.345307  -1.227    0.220    
Ethnicity1             0.272980   0.132916   2.054    0.040 *  
Ethnicity2            -0.225308   0.241135  -0.934    0.350    
Ethnicity3             0.197420   0.177079   1.115    0.265    
Ethnicity4            -0.141224   0.137876  -1.024    0.306    
Ethnicity5            -0.045418   0.202181  -0.225    0.822    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 2659.6  on 1960  degrees of freedom
Residual deviance: 2201.0  on 1947  degrees of freedom
AIC: 2229

Number of Fisher Scoring iterations: 4
Code
mod1 |>  emmeans::emmeans(~Ethnicity, type="response")
 Ethnicity     prob     SE  df asymp.LCL asymp.UCL
 Chinese      0.612 0.0388 Inf     0.534     0.685
 Asian Indian 0.489 0.0555 Inf     0.383     0.597
 Filipino     0.594 0.0520 Inf     0.489     0.690
 Korean       0.510 0.0431 Inf     0.426     0.594
 Other        0.534 0.0590 Inf     0.419     0.646
 Vietnamese   0.531 0.0415 Inf     0.450     0.611

Results are averaged over the levels of: Dental.Insurance, Religion 
Confidence level used: 0.95 
Intervals are back-transformed from the logit scale 
Code
mod1 |>  emmeans::emmeans(~Ethnicity) |> emmeans::contrast(method="eff", type="response") |> summary(infer=T)
 contrast            odds.ratio    SE  df asymp.LCL asymp.UCL null z.ratio
 Chinese effect           1.314 0.175 Inf     0.925      1.87    1   2.054
 Asian Indian effect      0.798 0.192 Inf     0.423      1.51    1  -0.934
 Filipino effect          1.218 0.216 Inf     0.764      1.94    1   1.115
 Korean effect            0.868 0.120 Inf     0.604      1.25    1  -1.024
 Other effect             0.956 0.193 Inf     0.561      1.63    1  -0.225
 Vietnamese effect        0.943 0.130 Inf     0.656      1.36    1  -0.425
 p.value
  0.2400
  0.5252
  0.5252
  0.5252
  0.8223
  0.8048

Results are averaged over the levels of: Dental.Insurance, Religion 
Confidence level used: 0.95 
Conf-level adjustment: bonferroni method for 6 estimates 
Intervals are back-transformed from the log odds ratio scale 
P value adjustment: fdr method for 6 tests 
Tests are performed on the log odds ratio scale 

Folk medicine

VSURF

Code
rfdata <- qol |> select(`Folkmedicine`,Ethnicity, Age, Gender,Religion, `Full Time Employment`,  Income_median, `US Born`:`Discrimination`,`Health Insurance`,`Dental Insurance`) |> 
    na.omit() |>
  as.data.frame() |> 
  rename_with(make.names)
Code
rfdata |> gtsummary::tbl_summary(include=Folkmedicine)
Characteristic N = 1,9471
Folkmedicine
    0 1,681 (86%)
    Yes 266 (14%)
1 n (%)
Code
rfdata |> gtsummary::tbl_summary(include=Folkmedicine)
Characteristic N = 1,9471
Folkmedicine
    0 1,681 (86%)
    Yes 266 (14%)
1 n (%)
Code
n_minority <- rfdata |> filter(Folkmedicine=="Yes") |> nrow()

rf_options_for_vsurf <- list(
  sampsize = c(n_minority, n_minority), # Sample 'n_minority' from each class
  strata = rfdata[,1],                     # Stratify by the response variable
  importance = TRUE                     # Ensure importance is calculated
)

VSURF(Folkmedicine~.,
      rfdata,
      na.action="na.omit",
      parallel=T,
      verbose=F,
      rf.options=rf_options_for_vsurf)->vsurf.mod
Warning in VSURF.formula(Folkmedicine ~ ., rfdata, na.action = "na.omit", : VSURF with a formula-type call outputs selected variables
which are indices of the input matrix based on the formula:
you may reorder these to get indices of the original data
Code
vsurf.mod |> summary()

 VSURF computation time: 9 secs 

 VSURF selected: 
    18 variables at thresholding step (in 4.6 secs)
    3 variables at interpretation step (in 2.8 secs)
    2 variables at prediction step (in 1.5 secs)

 VSURF ran in parallel on a PSOCK cluster and used 15 cores 
Code
names(rfdata[,-1])[vsurf.mod$varselect.pred]
[1] "Age"       "Ethnicity"
Code
names(rfdata[,-1])[vsurf.mod$varselect.interp]
[1] "Age"                   "Duration.of.Residency" "Ethnicity"            
Code
plot(vsurf.mod)

Code
vsurf.mod$mean.perf
[1] 0.1383924

Importance

Code
vi<- data.frame(Variable=names(rfdata[,-1])[vsurf.mod$imp.mean.dec.ind],
                Importance = vsurf.mod$imp.mean.dec,
                sd_Importance = vsurf.mod$imp.sd.dec
)|> 
  mutate(fill = case_when(Variable=="Ethnicity"~"red",
                                                 .default="black"))

vi |> mutate(across(Importance:sd_Importance,~round(.x,5)))
                         Variable Importance sd_Importance  fill
1                             Age    0.01739       0.00062 black
2           Duration.of.Residency    0.01355       0.00065 black
3                       Ethnicity    0.01320       0.00073   red
4                English.Speaking    0.00902       0.00074 black
5                        Religion    0.00710       0.00070 black
6                   Income_median    0.00562       0.00043 black
7            English.Difficulties    0.00480       0.00055 black
8        Familiarity.with.America    0.00270       0.00046 black
9                Primary.Language    0.00259       0.00029 black
10 Familiarity.with.Ethnic.Origin    0.00259       0.00047 black
11           Full.Time.Employment    0.00241       0.00032 black
12               Dental.Insurance    0.00117       0.00035 black
13                 Discrimination    0.00112       0.00028 black
14               Health.Insurance    0.00086       0.00014 black
15                        US.Born    0.00071       0.00017 black
16            Identify.Ethnically    0.00069       0.00034 black
17                         Gender    0.00050       0.00031 black
18                      Belonging    0.00023       0.00042 black
Code
importance_plot <- ggplot(vi, aes(x = reorder(Variable, Importance), y = Importance, fill=fill))+
  geom_bar(stat = "identity",alpha=0.4) +
  geom_errorbar(aes(ymin=Importance-sd_Importance, ymax = Importance+sd_Importance))+
  
  labs(title = "Variable Importance", x = "Variable", y = "Importance") +
  theme_classic() +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))+
  scale_fill_manual(values=c("black","red"),
                    guide="none")
  
plot(importance_plot)

Code
ggsave(filename = "FinalFigures/VSURF_importance_AltMedicine.png", width=6, height=4.5,units="in")

Analysis of Deviance

Code
pred_vars <- c(names(rfdata[,-1])[vsurf.mod$varselect.interp])
mod_form <- reformulate(pred_vars, response="Folkmedicine")

mod_form_noEth <- reformulate(pred_vars[!pred_vars %in%c("Ethnicity")], response="Folkmedicine")
mod1 <- glm(mod_form, family="binomial", data=rfdata)
mod2 <- glm(mod_form_noEth, family="binomial", data=rfdata)
anova(mod2,mod1)
Analysis of Deviance Table

Model 1: Folkmedicine ~ Age + Duration.of.Residency
Model 2: Folkmedicine ~ Age + Duration.of.Residency + Ethnicity
  Resid. Df Resid. Dev Df Deviance  Pr(>Chi)    
1      1944     1507.7                          
2      1939     1450.9  5    56.75 5.693e-11 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Code
performance::check_model(mod1)

Code
data.frame(BIC_ethnicity = BIC(mod1),
           BIC_noethnicity=BIC(mod2))
  BIC_ethnicity BIC_noethnicity
1      1511.497        1530.377
Code
summary(mod1)

Call:
glm(formula = mod_form, family = "binomial", data = rfdata)

Coefficients:
                       Estimate Std. Error z value Pr(>|z|)    
(Intercept)           -3.135528   0.204648 -15.322  < 2e-16 ***
Age                    0.022611   0.004636   4.877 1.08e-06 ***
Duration.of.Residency  0.007417   0.005960   1.244   0.2134    
Ethnicity1             0.522926   0.131708   3.970 7.18e-05 ***
Ethnicity2            -0.286686   0.173196  -1.655   0.0979 .  
Ethnicity3            -0.219541   0.217195  -1.011   0.3121    
Ethnicity4             0.757469   0.133299   5.682 1.33e-08 ***
Ethnicity5            -0.212500   0.287863  -0.738   0.4604    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 1552.9  on 1946  degrees of freedom
Residual deviance: 1450.9  on 1939  degrees of freedom
AIC: 1466.9

Number of Fisher Scoring iterations: 5
Code
mod1 |>  emmeans::emmeans(~Ethnicity, type="response")
 Ethnicity      prob     SE  df asymp.LCL asymp.UCL
 Chinese      0.1741 0.0176 Inf    0.1422    0.2113
 Asian Indian 0.0857 0.0142 Inf    0.0617    0.1180
 Filipino     0.0912 0.0203 Inf    0.0585    0.1394
 Korean       0.2104 0.0208 Inf    0.1725    0.2541
 Other        0.0917 0.0280 Inf    0.0496    0.1634
 Vietnamese   0.0665 0.0125 Inf    0.0459    0.0955

Confidence level used: 0.95 
Intervals are back-transformed from the logit scale 
Code
mod1 |>  emmeans::emmeans(~Ethnicity) |> emmeans::contrast(method="eff", type="response") |> summary(infer=T)
 contrast            odds.ratio    SE  df asymp.LCL asymp.UCL null z.ratio
 Chinese effect           1.687 0.222 Inf     1.192     2.388    1   3.970
 Asian Indian effect      0.751 0.130 Inf     0.475     1.186    1  -1.655
 Filipino effect          0.803 0.174 Inf     0.453     1.424    1  -1.011
 Korean effect            2.133 0.284 Inf     1.500     3.032    1   5.682
 Other effect             0.809 0.233 Inf     0.378     1.728    1  -0.738
 Vietnamese effect        0.570 0.105 Inf     0.351     0.927    1  -3.053
 p.value
  0.0002
  0.1468
  0.3745
  <.0001
  0.4604
  0.0045

Confidence level used: 0.95 
Conf-level adjustment: bonferroni method for 6 estimates 
Intervals are back-transformed from the log odds ratio scale 
P value adjustment: fdr method for 6 tests 
Tests are performed on the log odds ratio scale